# pip install PyMuPDF # (install PyMuPDF for extracting info from PDF files)
# pip install tika # (install tika for extracting paragraphs from PDF files)
# pip install spacy==2.2.0 # (install spacy for lemmatization)
# conda install gensim # (install gensim for topic modelling)
# pip install pyLDAvis # (install pyLDAvis for topic-modelling visualisation)
# conda install -c conda-forge pyldavis # (if you use Anaconda to install pyLDAvis)
import pandas as pd
import numpy as np
import nltk; nltk.download('stopwords')
from nltk.corpus import stopwords # import stop words
stop_words = stopwords.words('english')
import re
from pprint import pprint
# glob for extracting the directories of metadata
import glob
# PyMuPDF
import fitz
# tika
import tika
from tika import parser
# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
# spacy for lemmatization
import spacy
# Visualisation
import plotly.express as px
import pyLDAvis
import pyLDAvis.gensim_models
import matplotlib.pyplot as plt
%matplotlib inline
# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)
import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)
import os
# Extract the directories of the PDF files
# Use a raw string for the Windows path: in a normal string literal, backslash
# sequences (e.g. "\t", "\n") would be interpreted as escape characters.
pdf_dir = r"D:\LEON\Business Analytics\Study\9. Business Project\Data set\Crossrail"
pdf_files = glob.glob("%s/*.pdf" % pdf_dir)
pdf_files[3]  # notebook display: sanity-check one of the discovered paths
# Use PyMuPDF to collect the metadata (title, date, ...) and full text of
# every PDF file, one metadata dict per document.
list_metadata = []
for pdf_path in pdf_files:
    with fitz.open(pdf_path) as doc:
        info = doc.metadata
        info['file_name'] = os.path.basename(pdf_path)
        pages_text = []
        for page in doc:
            pages_text.append(page.getText())
        info['Content'] = ''.join(pages_text)
    list_metadata.append(info)
df = pd.DataFrame(list_metadata)
df['document_id'] = df.index  # stable per-document identifier
df.head(3)
# Drop duplicated and missing text content, then add a per-document word count
# (spaces + 1 approximates the number of words).
df = df.drop_duplicates(subset='Content')
df = df.dropna(subset=['Content'])
df['Word_count'] = df['Content'].str.count(' ') + 1
df.info()
# Total word count of the whole corpus.
df['Word_count'].sum()
# Word count distribution
#import seaborn as sns
#ax1 = sns.distplot(df['Word_count'])
#ax1.set(title = 'Word Count Distribution',
#        xlabel = 'Word Count of Each Document');
data = df.Content.values.tolist()

def sent_to_words(sentences):
    """Yield one list of lower-cased tokens per input text.

    gensim.utils.simple_preprocess lower-cases, tokenises and drops very
    short/long tokens; deacc=True additionally removes punctuation/accents.
    The original ``.encode('utf-8')`` round-trip was redundant:
    simple_preprocess converts its input back to unicode internally.
    """
    for sentence in sentences:
        yield gensim.utils.simple_preprocess(str(sentence), deacc=True)

data_words = list(sent_to_words(data))
Remove Stopwords, Make Bigrams and Trigrams,Lemmatisation, remove short words and meaningless words
# Build the bigram and trigram phrase models over the tokenised corpus.
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
# Trigrams are learned on top of the bigram-merged token streams.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)
# Phraser is a lighter, faster read-only wrapper for applying a trained Phrases model.
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)
# Helpers for stop-word removal, n-gram formation and lemmatisation.
def remove_stopwords(texts):
    """Drop English stop words from each tokenised document."""
    cleaned = []
    for doc in texts:
        kept = [word for word in simple_preprocess(str(doc)) if word not in stop_words]
        cleaned.append(kept)
    return cleaned
def make_bigrams(texts):
    """Apply the trained bigram phrase model to every tokenised document."""
    result = []
    for doc in texts:
        result.append(bigram_mod[doc])
    return result
def make_trigrams(texts):
    """Apply the bigram model and then the trigram model to each document."""
    result = []
    for doc in texts:
        result.append(trigram_mod[bigram_mod[doc]])
    return result
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Lemmatise each tokenised document with spaCy, keeping only tokens
    whose part-of-speech tag is in allowed_postags.

    See https://spacy.io/api/annotation for the tag set.
    Relies on the module-level spaCy pipeline ``nlp``.
    """
    texts_out = []
    for tokens in texts:
        parsed = nlp(" ".join(tokens))
        lemmas = [tok.lemma_ for tok in parsed if tok.pos_ in allowed_postags]
        texts_out.append(lemmas)
    return texts_out
# Remove stop words.
data_words_nostops = remove_stopwords(data_words)
# Form Bigrams
# data_words_bigrams = make_bigrams(data_words_nostops)
# Form trigrams (the bigram model is applied first inside make_trigrams).
data_words_trigrams = make_trigrams(data_words_nostops)
# Initialise the spaCy 'en' model, keeping only the tagger component for speed.
# python3 -m spacy download en
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
# Raise the maximum text length spaCy will accept in one call.
nlp.max_length = 13000000
# Lemmatise, keeping only nouns, adjectives, verbs and adverbs.
data_lemmatized1 = lemmatization(data_words_trigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
print(data_lemmatized1[:1])
# Discard lemmas shorter than minimum_len characters.
minimum_len = 4
data_lemmatized = [[w for w in doc if len(w) >= minimum_len] for doc in data_lemmatized1]
print(data_lemmatized[:1])
# Create the token dictionary (maps each unique lemma to an integer id).
id2word = corpora.Dictionary(data_lemmatized)
# Create the corpus.
texts = data_lemmatized
# Term-document frequency: each document becomes a list of (token_id, count).
corpus = [id2word.doc2bow(text) for text in texts]
# View the bag-of-words form of the first document.
print(corpus[:1])
# Set training parameters. NOTE: k is re-assigned to 15 before the final fit.
k = 20                      # number of topics
passes = 20                 # full passes over the corpus during training
iterations = 100            # max iterations per document chunk
alpha = 50.0/k              # document-topic prior (classic 50/k heuristic)
eta = 0.01                  # topic-word prior
random_state = 12345        # seed for reproducibility
minimum_probability = 0     # keep every topic probability in the output
# Compute coherence scores for LDA models across a range of topic counts.
def compute_coherence_values(dictionary, corpus, texts, limit, start, step):
    """Train one LDA model per topic count in range(start, limit, step) and
    score each with the 'c_v' coherence measure.

    Parameters
    ----------
    dictionary : gensim.corpora.Dictionary mapping token ids to tokens.
    corpus : bag-of-words corpus (list of lists of (token_id, count)).
    texts : tokenised documents, required by the 'c_v' coherence measure.
    limit, start, step : range of topic counts to evaluate.

    Returns
    -------
    (model_list, coherence_values) : trained models and their scores.

    NOTE: alpha, eta, iterations, passes, random_state and
    minimum_probability are read from module-level globals.
    """
    coherence_values = []
    model_list = []
    # Fix: the loop variable no longer shadows the module-level 'k', and the
    # 'dictionary' parameter is actually used (the original passed the global
    # id2word to LdaModel, silently ignoring the argument).
    for num_topics in range(start, limit, step):
        model = gensim.models.LdaModel(num_topics=num_topics, corpus=corpus,
                                       id2word=dictionary, alpha=alpha, eta=eta,
                                       iterations=iterations, passes=passes,
                                       random_state=random_state,
                                       minimum_probability=minimum_probability)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts,
                                        dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())
    return model_list, coherence_values
# Apply the function above to search for a good topic count (slow).
#limit=80; start=0; step=5;
#model_list, coherence_values = compute_coherence_values(dictionary=id2word, corpus=corpus, texts=texts, start=start, limit=limit, step=step)
# Plot the coherence score against the number of topics.
#x = range(start, limit, step)
#list_num_topics = [i for i in x]
#df_coherence = pd.DataFrame({'Number_of_Topics': list_num_topics, 'Coherence_Score': coherence_values})
#fig1 = px.line(df_coherence, x = 'Number_of_Topics', y="Coherence_Score", title = 'Coherence score against number of topics')
#fig1.update_layout(autosize=False, width=1000, height=400)
#fig1.update_traces(mode = "lines + markers")
#fig1.show()
# Train the final LDA model. NOTE(review): the original comment claimed
# "num of topics = 35 to get the optimal coherence score" but k is set to 15
# here -- confirm which topic count the coherence search actually selected.
k = 15
lda_model = gensim.models.LdaModel(
corpus = corpus,
id2word = id2word,
alpha = alpha,
eta = eta,
iterations = iterations,
num_topics = k,
passes = passes,
random_state = 12345,
minimum_probability = minimum_probability)
# Split a document into paragraphs on blank-line markers.
def para_split(text):
    """Split *text* on '\\n \\n' when that marker occurs anywhere in it,
    otherwise on '\\n\\n' (two consecutive newlines)."""
    separator = '\n \n' if '\n \n' in text else '\n\n'
    return text.split(separator)
# Extract paragraphs from every PDF with tika and attach them to the metadata.
list_paragraphs = []
list_para_id = []
for pdf_path in pdf_files:
    parsed = parser.from_file(pdf_path)
    content = parsed['content']
    paragraphs = para_split(content)
    paragraphs = [p.replace('\n', '') for p in paragraphs]
    paragraphs = [p.strip() for p in paragraphs if p.strip()]  # drop empty entries
    list_paragraphs.append(paragraphs)
    list_para_id.append(list(range(len(paragraphs))))
df_para1 = df.copy()
df_para1['paragraphs'] = list_paragraphs
df_para1['para_id'] = list_para_id
# Explode the per-document lists so there is one row per paragraph.
df_para2 = df_para1.apply(pd.Series.explode)
df_para3 = df_para2.reset_index()
df_para4 = df_para3[['creationDate', 'document_id', 'file_name', 'para_id', 'paragraphs']]
df_para4
I applied the classification to the 148,652 paragraphs above, but it is quite time-consuming and I had not got the result after 2 hours. So, as shown below, I provide users with a threshold n for selecting only the paragraphs with more than n words for classification. Here I set n = 30, which decreased the number of paragraphs to be classified to 52,697; it takes about half an hour to get the result.
# Keep only the paragraphs containing at least n_word_count words.
n_word_count = 30
word_counts = df_para4['paragraphs'].str.split().str.len()
df_para = df_para4[word_counts >= n_word_count].reset_index()
df_para
# Tokenise the selected paragraphs.
data2 = df_para.paragraphs.values.tolist()
data_words2 = list(sent_to_words(data2))
# Remove stop words.
data_words_nostops2 = remove_stopwords(data_words2)
# Form trigrams.
data_words_trigrams2 = make_trigrams(data_words_nostops2)
# Lemmatise, keeping only nouns, adjectives, verbs and adverbs.
data_lemmatized2 = lemmatization(data_words_trigrams2, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
# Discard lemmas shorter than minimum_len characters.
minimum_len = 4
data_lemmatized2_1 = [[w for w in doc if len(w) >= minimum_len] for doc in data_lemmatized2]
# Convert a list of (topic_id, probability) tuples into a dictionary.
def Convert(tup, di):
    """Return dict(tup).

    The *di* argument is kept for interface compatibility with existing
    callers but is never read: the original rebound it immediately.
    """
    result = dict(tup)
    return result
# Classify the topic distribution of every paragraph ("belong" step).
# This can take a long time: 148,651 paragraphs in an ~11M-word corpus.
list_topic_para = []
dictionary_topic_para = {}
for doc in data_lemmatized2_1:
    bow = id2word.doc2bow(doc)
    distribution = lda_model[bow]
    list_topic_para.append(Convert(distribution, dictionary_topic_para))
df_topic_para = pd.DataFrame(list_topic_para)
# Attach the topic distribution to the paragraph metadata.
df_topic_para1_1 = pd.merge(df_para, df_topic_para, how='left', left_index=True, right_index=True)
df_topic_para1_1
# Save the result to disk.
df_topic_para1_1.to_pickle('./df_topic_para1.pkl')
# Load the classification result back from disk.
df_topic_para1 = pd.read_pickle('./df_topic_para1.pkl')
df_topic_para1_n = df_topic_para1.copy()
# Highest probability among each paragraph's topic distribution.
# NOTE(review): columns from index 6 onward are assumed to be the topic
# probability columns (metadata columns first) -- confirm; a later cell
# filters with iloc[:, 5:] instead, so one offset is likely off by one.
df_topic_para1_n['highest_p'] = df_topic_para1_n.iloc[:, 6:].max(axis = 1)
# Topic id that attains that highest probability.
df_topic_para1_n['salient_topic'] = df_topic_para1.iloc[:, 6:].idxmax(axis = 1)
df_topic_para1_n = df_topic_para1_n[['file_name', 'para_id', 'paragraphs', 'salient_topic', 'highest_p']]
# The five highest-ranked paragraphs overall.
df_topic_para1_n.nlargest(5,['highest_p'])
# Extract the top-N highest-probability paragraphs for each topic.
def top_n_filter(df, top_n, num_topics=None):
    """Return a long-format DataFrame with the *top_n* paragraphs that have
    the highest membership probability for every topic.

    Parameters
    ----------
    df : DataFrame with a 'paragraphs' column and one probability column
        per topic (column labels are the integer topic ids 0..num_topics-1).
    top_n : number of paragraphs to keep per topic.
    num_topics : number of topic columns to scan; defaults to the
        module-level ``k`` for backward compatibility with existing callers.

    Returns
    -------
    DataFrame with columns 'topic_id', 'salient_paragraph', 'probability',
    one row per (topic, paragraph) pair.
    """
    if num_topics is None:
        num_topics = k  # fall back to the module-level topic count
    list_topic_id = list(range(num_topics))
    list_n_para = []
    list_n_p = []
    for topic in list_topic_id:
        # Compute nlargest once per topic (the original evaluated it twice).
        top_rows = df.nlargest(top_n, [topic])
        list_n_para.append(list(top_rows['paragraphs']))
        list_n_p.append(list(top_rows[topic]))
    pd_n_para = pd.DataFrame({'topic_id': list_topic_id,
                              'salient_paragraph': list_n_para,
                              'probability': list_n_p})
    return pd_n_para.apply(pd.Series.explode)
# The two highest-ranked paragraphs from each topic.
top_n_filter(df_topic_para1, 2)
df_n_topic_k = top_n_filter(df_topic_para1, 2)
topic_id_chosen = 7 # choose the topic ID
num_para = 2 # set N
topic_id_filter = df_n_topic_k['topic_id'] == topic_id_chosen
df_n_topic_k[topic_id_filter]
# Select the paragraphs whose maximum topic probability exceeds the threshold.
threshold = 1/3 # set threshold
# NOTE(review): iloc[:, 5:] here vs iloc[:, 6:] in the salient-topic cell --
# one offset is probably off by one; confirm which columns hold the topic
# probabilities.
topic_filter = df_topic_para1.iloc[:, 5:].max(axis=1) > threshold # set filter
df_topic_para_M = df_topic_para1[topic_filter] # extract the qualifying paragraphs
df_topic_para_M
# The two highest-ranked paragraphs among the qualifying set.
top_n_filter(df_topic_para_M, 2)
# Print the top-weighted words of every topic.
pprint(lda_model.print_topics())
# Topic distribution over documents.
list_topic = []
dictionary_topic = {}
for d in texts:
    bow = id2word.doc2bow(d)
    belong = lda_model[bow] # list of (topic_id, probability) tuples for one document
    belong_dic = Convert(belong, dictionary_topic) # convert the list of tuples into a dictionary
    list_topic.append(belong_dic)
df_topic_distribution = pd.DataFrame(list_topic) # convert the list of dictionaries into a dataframe
df_topic = pd.merge(df, df_topic_distribution, how = 'left', left_index=True, right_index=True) # merge with info of documents
# NOTE(review): DataFrame.drop returns a new frame and the result below is
# not assigned -- it only renders in the notebook; df_topic itself keeps all
# columns. Confirm whether the drop was meant to be persisted.
df_topic.drop(['title','format','creator', 'producer', 'keywords', 'trapped', 'encryption','subject', 'modDate'], axis = 1)
As shown below, there are only 8 topics with topic distribution above the threshold (1/K); according to the PTBI proposed by Marchetti and Puranam (2020), these 8 topics are the salient topics worth interpreting. However, since the extracted topics exceed that number and 8 topics are not enough to interpret all the documents, we prefer to interpret all the topics.
# Share of each topic in the whole corpus (normalised column sums).
total_per_topic = df_topic_distribution.sum()
topic_distribution = total_per_topic / total_per_topic.sum()
topics_distribution = pd.DataFrame({'topic_id': topic_distribution.index,
                                    'topic_distribution': topic_distribution,
                                    'Not_less_than_threshold': topic_distribution >= 1/k})
topics_distribution.sort_values(by='topic_distribution', ascending=False)
fig2 = px.bar(topics_distribution, x='topic_id', y='topic_distribution', title='Topic distribution of the whole corpus')
fig2.update_layout(autosize=False, width=1000, height=300)
fig2.show()
To interpret the topics, I combined the word frequency demonstrated by pyLDAvis with the prototypical documents or paragraphs suggested by the PTBI proposed by Marchetti and Puranam (2020).
# Visualise the topics interactively with pyLDAvis.
pyLDAvis.enable_notebook()
# sort_topics=False keeps pyLDAvis topic numbering aligned with gensim's ids.
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word, sort_topics = False )
vis
I followed the method of extraction of prototypical text suggested by the PTBI proposed by Marchetti and Puranam (2020). Its heart lies in the idea that, for parameter L (the probability that a document belongs to a topic), at least 1/L documents whose probability of belonging to the topic is >= L are needed to interpret that topic. This method tries to find the optimal value of L that maximises the percentage of interpretable topics (Marchetti and Puranam, 2020, p. 20).
# 1/L = minimum number of documents needed to interpret a topic.
List_num_doc = list(range(1, 20))
# The candidate values of parameter L itself.
list_L = [1 / n for n in List_num_doc]
# Percentage of potentially interpretable topics for a given parameter L.
def perc(i, df, num_topics=None):
    """Return the fraction of topics having at least 1/i rows (documents or
    paragraphs) whose membership probability is >= i.

    Parameters
    ----------
    i : float -- the PTBI parameter L (0 < i <= 1).
    df : DataFrame of topic-membership probabilities, one column per topic.
    num_topics : denominator (total topic count); defaults to the
        module-level ``k`` for backward compatibility with existing callers.
    """
    if num_topics is None:
        num_topics = k  # fall back to the module-level topic count
    counts_per_topic = []
    for col in df:
        # Number of rows whose probability for this topic is at least i
        # (NaN comparisons are False, matching the original count()).
        counts_per_topic.append((df[col] >= i).sum())
    interpretable = sum(1 for c in counts_per_topic if c >= 1 / i)
    return interpretable / num_topics
The following chart shows that the percentage of potentially interpretable topics for "high enough" levels of L is not large enough, so a paragraph-based interpretation can be explored.
# Document-based: percentage of interpretable topics for each candidate L.
list_perc1 = [perc(candidate_L, df_topic_distribution) for candidate_L in list_L]
df_L1 = pd.DataFrame({'Parameter L': list_L, 'Percentage of interpretable topics': list_perc1})
fig_L1 = px.line(df_L1, x='Parameter L', y="Percentage of interpretable topics", title='Value selection for parameter L (document-based)')
fig_L1.update_layout(autosize=False, width=1200, height=400)
fig_L1.update_traces(mode="lines + markers")
fig_L1.show()
The following chart shows that when L = 0.5, the percentage of interpretable topics is 86.7%, so we set L = 0.5 — i.e., each topic needs at least 2 (= 1/L) paragraphs to be interpreted.
# Paragraph-based: percentage of interpretable topics for each candidate L.
df_topic_para
#df_topic_para2 = df_topic_para1.drop(['document_id', 'paragraphs'], axis = 1)
list_perc2 = [perc(candidate_L, df_topic_para) for candidate_L in list_L]
df_L2 = pd.DataFrame({'Parameter L': list_L, 'Percentage of interpretable topics': list_perc2})
fig_L2 = px.line(df_L2, x='Parameter L', y="Percentage of interpretable topics", title='Value selection for parameter L (paragraph-based )')
fig_L2.update_layout(autosize=False, width=1200, height=400)
fig_L2.update_traces(mode="lines + markers")
fig_L2.show()
# --- Disabled alternative pipeline: retrain the LDA model on paragraphs ---
# tokenization
#data2 = df_para.paragraphs.values.tolist()
#data_words2_2 = list(sent_to_words(data2))
# set the length of word threshold for removing the words less than the threshold
#minimum_len = 4
#data_words2 = []
#for i in data_words2_2:
#    new_element = [x for x in i if len(x) >= minimum_len]
#    data_words2.append(new_element)
# BUGFIX: the two loop-body lines above were accidentally left uncommented,
# so they executed at module level with the stale loop variable 'i' left over
# from the preceding 'for i in list_L' loop (a float), which raises a
# TypeError when iterated -- and would otherwise have corrupted data_words2.
# Bigram & Trigram
#bigram2 = gensim.models.Phrases(data_words2, min_count=5, threshold=100) # higher threshold fewer phrases.
#trigram2 = gensim.models.Phrases(bigram2[data_words2], threshold=100)
#bigram_mod2 = gensim.models.phrases.Phraser(bigram2)
#trigram_mod2 = gensim.models.phrases.Phraser(trigram2)
# Remove Stop Words
#data_words_nostops2 = remove_stopwords(data_words2)
# Form Trigrams
#data_words_trigrams2 = make_trigrams(data_words_nostops2)
# Do lemmatization keeping only noun, adj, vb, adv
#data_lemmatized2 = lemmatization(data_words_trigrams2, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
# Create Dictionary
#id2word2 = corpora.Dictionary(data_lemmatized2)
# Create Corpus
#texts2 = data_lemmatized2
# Term Document Frequency
# NOTE(review): the original iterated over 'texts' (the document-level
# corpus) here instead of 'texts2'; corrected below for when this is re-enabled.
#corpus2 = [id2word2.doc2bow(text) for text in texts2]
#lda_model2 = gensim.models.LdaModel(
#    corpus=corpus2,
#    id2word=id2word2,
#    alpha=alpha,
#    eta=eta,
#    iterations=iterations,
#    num_topics=k,
#    passes=passes)
# Compute Coherence Score
#coherence_model_lda2 = CoherenceModel(model=lda_model2, texts=data_lemmatized2, dictionary=id2word2, coherence='c_v')
#coherence_lda2 = coherence_model_lda2.get_coherence()
#print('\nCoherence Score: ', coherence_lda2)
# Visualize the topics
#vis2 = pyLDAvis.gensim_models.prepare(lda_model2, corpus2, id2word2, sort_topics = False)
#vis2